1 Introduction

During the quality control (QC) steps, we have accumulated different doublet annotations:

  • Experimental: cell hashing, which we consider to be gold-standard.
  • Computational: scrublet, which we consider to be silver-standard.
  • Identifying cells with extreme library sizes or number of detected genes.

Although we consider cell hashing as ground-truth, it has some limitations. First, it cannot detect intra-batch doublets (doublets that share the same hashtag). Second, we observed a marked variability in hashing efficiency across libraries (measured with the signal-to-noise ratio), which suggests that for some libraries the detection was not perfect. Finally, as we wanted to measure the effect of hashing on transcriptional profiles, we included several non-hashed libraries, for which we have no experimental doublet annotation.

To overcome these issues, we ran scrublet, which predicts doublets computationally. Although it cannot find homotypic doublets (composed of cells that share the same transcriptional state), it will help us to accumulate independent sources of evidence that we can visualize downstream.

Of particular note, both cell hashing and scrublet were run for each library independently. Thus, we aimed to combined both approaches in a single metric that can consider all cells in the dataset, hence increasing the statistical power to detect doublets. Here, we calculate the proportion of doublet nearest neighbors (pDNN) using the KNN graph computed in the previous notebook. This metric is inspired in the proportion of artificial nearest neighbors (pANN) described in the DoubletFinder article. Following this principle, we expect to obtain a bimodal pDNN distribution that allows us to easily separate singlets and homotypic doublets from heterotypic doublets.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(Matrix)
library(tidyverse)

2.2 Parameters

# Paths
path_to_knn <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/1-doublet_exclusion/tmp/integrated_knn_graph.rds")
path_to_doubl_annot <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/1-doublet_exclusion/tmp/doublet_preliminary_annotations.rds")
path_to_save <- here::here("scRNA-seq/2-QC/5-batch_effect_correction/1-doublet_exclusion/tmp/doublet_final_annotations_and_pDNN.rds")


# Functions
source(here::here("scRNA-seq/bin/utils.R"))


# Thresholds
k <- 75

2.3 Load data

knn_graph <- readRDS(path_to_knn)
doublet_annot <- readRDS(path_to_doubl_annot)

3 Compare doublet annotations

For a fair comparison, we will only compare doublet annotations for hashed cells:

is_hashed <- doublet_annot$HTO_classification.global != "NA"
doubl_annot_hashing <- doublet_annot$HTO_classification.global[is_hashed]
doubl_annot_scrublet <- doublet_annot$scrublet_predicted_doublet[is_hashed]
tabled_annotations <- table(
  hashing = factor(doubl_annot_hashing, c("Singlet", "Doublet")),
  scrublet = doubl_annot_scrublet
)
tabled_annotations
##          scrublet
## hashing    FALSE   TRUE
##   Singlet 199585  17028
##   Doublet  59041  21536

As we can see, the overlap between scrublet and cell hashing is relatively small. This is expected, since these two approaches have complementary limitations (discussed above).

4 Calculate and visualize pDNN

4.1 Hashing only

# Order levels HTO classification for later plots
doublet_annot$HTO_classification.global <- factor(
  doublet_annot$HTO_classification.global,
  levels = c("Singlet", "Doublet", "NA")
)
levels(doublet_annot$HTO_classification.global) <- c(
  "Singlet",
  "Doublet",
  "Not Hashed"
)


# Calculate pDNN
doublets_hashing <- which(doublet_annot$HTO_classification.global == "Doublet")
knn_graph_hashing <- knn_graph[, doublets_hashing]
doublet_annot$pDNN_hashing <- Matrix::rowSums(knn_graph_hashing) / k


# Plot
label <- "pDNN (cell hashing)"
pDNN_hashing_hist <- doublet_annot %>%
  plot_histogram_doublets(x = "pDNN_hashing", x_lab = label, bins = 30)
pDNN_hashing_density <- doublet_annot %>%
  plot_density_doublets(
    x = "pDNN_hashing",
    x_lab = label,
    color = "HTO_classification.global",
    color_lab = "HTO classification"
  )
pDNN_hashing_boxplot <- doublet_annot %>%
  plot_boxplot_doublets(
    x = "HTO_classification.global",
    y = "pDNN_hashing",
    fill = "HTO_classification.global",
    y_lab = label
  )
scrublet_label <- "Doublet Score (scrublet)"
pDNN_hashing_scatter <- doublet_annot %>%
  plot_scatter_doublets(
    x = "scrublet_doublet_scores",
    y = "pDNN_hashing",
    x_lab = scrublet_label,
    y_lab = label
  )


# Show plots
pDNN_hashing_hist

pDNN_hashing_density

pDNN_hashing_boxplot

pDNN_hashing_scatter

4.2 Scrublet only

# Calculate pDNN
doublets_scrublet <- which(doublet_annot$scrublet_predicted_doublet)
knn_graph_scrublet <- knn_graph[, doublets_scrublet]
doublet_annot$pDNN_scrublet <- Matrix::rowSums(knn_graph_scrublet) / k


# Plot
label <- "pDNN (scrublet)"
pDNN_scrublet_hist <- doublet_annot %>%
  plot_histogram_doublets(x = "pDNN_scrublet", x_lab = label, bins = 30)
pDNN_scrublet_density <- doublet_annot %>%
  plot_density_doublets(
    x = "pDNN_scrublet",
    x_lab = label,
    color = "HTO_classification.global",
    color_lab = "HTO classification"
  )
pDNN_scrublet_boxplot <- doublet_annot %>%
  plot_boxplot_doublets(
    x = "HTO_classification.global",
    y = "pDNN_scrublet",
    fill = "HTO_classification.global",
    y_lab = label
  )
pDNN_scrublet_scatter <- doublet_annot %>%
  plot_scatter_doublets(
    x = "scrublet_doublet_scores",
    y = "pDNN_scrublet",
    x_lab = scrublet_label,
    y_lab = label
  )
pDNN_scrublet_hashing <- doublet_annot %>%
  plot_scatter_doublets(
    x = "pDNN_hashing",
    y = "pDNN_scrublet",
    x_lab = "pDNN (cell hashing)",
    y_lab = label
  )


# Show plots
pDNN_scrublet_hist

pDNN_scrublet_density

pDNN_scrublet_boxplot

pDNN_scrublet_scatter

pDNN_scrublet_hashing

4.3 Union

# Calculate pDNN
all_doublets <-
  doublet_annot$HTO_classification.global == "Doublet" |
  doublet_annot$scrublet_predicted_doublet
doublets_union <- which(all_doublets)
knn_graph_union <- knn_graph[, doublets_union]
doublet_annot$pDNN_union <- Matrix::rowSums(knn_graph_union) / k


# Plot
label <- "pDNN (cell hashing or scrublet)"
pDNN_union_hist <- doublet_annot %>%
  plot_histogram_doublets(x = "pDNN_union", x_lab = label, bins = 30)
pDNN_union_density <- doublet_annot %>%
  plot_density_doublets(
    x = "pDNN_union",
    x_lab = label,
    color = "HTO_classification.global",
    color_lab = "HTO classification"
  )
pDNN_union_boxplot <- doublet_annot %>%
  plot_boxplot_doublets(
    x = "HTO_classification.global",
    y = "pDNN_union",
    fill = "HTO_classification.global",
    y_lab = label
  )
pDNN_union_scatter <- doublet_annot %>%
  plot_scatter_doublets(
    x = "scrublet_doublet_scores",
    y = "pDNN_union",
    x_lab = scrublet_label,
    y_lab = label
  )


# Show plots
pDNN_union_hist

pDNN_union_density

pDNN_union_boxplot

pDNN_union_scatter

Interestingly, considering both annotations yields a better separation of singlets/homotypic doublets and heterotypic doublets.

5 Exclude doublets

Initially, we will be very permissive and only exclude doublets annotated with cell hashing. We will keep both the pDNN and scrublet variables in the metadata, which we will leverage downstream to rule out doublets at the cluster level. This approach will help us both remove doublets and understand its impacts on the data.

table(doublet_annot$HTO_classification.global == "Doublet")
## 
##  FALSE   TRUE 
## 266685  80577

In total, we will eliminate 80577 doublets.

Finally, we will keep the cells that have and outlier library size, as they could represent a specific cell type.

table(doublet_annot$has_high_lib_size)
## 
##  FALSE   TRUE 
## 344761   2501

6 Save

saveRDS(doublet_annot, path_to_save)

7 Session Information

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server release 6.7 (Santiago)
## 
## Matrix products: default
## BLAS:   /apps/R/3.6.0/lib64/R/lib/libRblas.so
## LAPACK: /home/devel/rmassoni/anaconda3/lib/libmkl_rt.so
## 
## locale:
##  [1] LC_CTYPE=C                 LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.0    stringr_1.4.0    dplyr_1.0.4      purrr_0.3.4      readr_1.3.1      tidyr_1.1.0      tibble_3.0.1     ggplot2_3.3.0    tidyverse_1.3.0  Matrix_1.2-18    Seurat_3.2.0     BiocStyle_2.14.4
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15            colorspace_1.4-1      deldir_0.1-25         ellipsis_0.3.1        ggridges_0.5.2        rprojroot_1.3-2       fs_1.4.1              rstudioapi_0.11       spatstat.data_1.4-3   farver_2.0.3          leiden_0.3.3          listenv_0.8.0         ggrepel_0.8.2         fansi_0.4.1           lubridate_1.7.8       xml2_1.3.2            codetools_0.2-16      splines_3.6.0         knitr_1.28            polyclip_1.10-0       jsonlite_1.7.2        broom_0.5.6           ica_1.0-2             cluster_2.1.0         dbplyr_1.4.4          png_0.1-7             uwot_0.1.8            shiny_1.4.0.2         sctransform_0.2.1     BiocManager_1.30.10   compiler_3.6.0        httr_1.4.2            backports_1.1.7       assertthat_0.2.1      fastmap_1.0.1         lazyeval_0.2.2        cli_2.0.2             later_1.0.0           htmltools_0.4.0       tools_3.6.0           rsvd_1.0.3            igraph_1.2.5          gtable_0.3.0          glue_1.4.1            RANN_2.6.1            reshape2_1.4.4        rappdirs_0.3.1        Rcpp_1.0.6            spatstat_1.64-1       cellranger_1.1.0      vctrs_0.3.6           ape_5.3               nlme_3.1-148          lmtest_0.9-37        
##  [55] xfun_0.14             globals_0.12.5        rvest_0.3.5           mime_0.9              miniUI_0.1.1.1        lifecycle_0.2.0       irlba_2.3.3           goftest_1.2-2         future_1.17.0         MASS_7.3-51.6         zoo_1.8-8             scales_1.1.1          hms_0.5.3             promises_1.1.0        spatstat.utils_1.17-0 parallel_3.6.0        RColorBrewer_1.1-2    yaml_2.2.1            reticulate_1.16       pbapply_1.4-2         gridExtra_2.3         rpart_4.1-15          stringi_1.4.6         rlang_0.4.10          pkgconfig_2.0.3       evaluate_0.14         lattice_0.20-41       ROCR_1.0-11           tensor_1.5            labeling_0.3          patchwork_1.0.0       htmlwidgets_1.5.1     cowplot_1.0.0         tidyselect_1.1.0      here_0.1              RcppAnnoy_0.0.16      plyr_1.8.6            magrittr_1.5          bookdown_0.19         R6_2.4.1              generics_0.0.2        DBI_1.1.0             withr_2.4.1           pillar_1.4.4          haven_2.3.1           mgcv_1.8-31           fitdistrplus_1.1-1    survival_3.1-12       abind_1.4-5           future.apply_1.5.0    modelr_0.1.8          crayon_1.3.4          KernSmooth_2.23-17    plotly_4.9.2.1       
## [109] rmarkdown_2.2         readxl_1.3.1          grid_3.6.0            data.table_1.12.8     blob_1.2.1            reprex_0.3.0          digest_0.6.20         xtable_1.8-4          httpuv_1.5.3.1        munsell_0.5.0         viridisLite_0.3.0